1 Setup

suppressPackageStartupMessages({
  library(data.table)
  library(DESeq2)
  library(gplots)
  library(here)
  library(hyperSpec)
  library(parallel)
  library(pander)
  library(plotly)
  library(tidyverse)
  library(tximport)
  library(vsn)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
hpal <- colorRampPalette(c("blue","white","red"))(100)
samples <- read_csv(here("doc/samples_final.csv"))
## Rows: 48 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): ScilifeID, SubmittedID, Stages, Description, ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

2 Raw data

filelist <- list.files(here("data/salmon"), 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)

Sanity check to ensure that the data is sorted according to the sample info

filelist <- filelist[match(samples$ScilifeID,sub("_sortmerna.*","",basename(dirname(filelist))))]

stopifnot(all(match(sub("_sortmerna.*","",basename(dirname(filelist))),
                    samples$ScilifeID) == 1:nrow(samples)))

name the file list vector

names(filelist) <- samples$ID

Read the expression at the gene level

counts <- suppressMessages(round(tximport(files = filelist, 
                                          type = "salmon",
                                          txOut=TRUE)$counts))

combine technical replicates

samples$ID <- sub("_L00[1,2]", "",
                  samples$ScilifeID)
counts <- do.call(
  cbind,
  lapply(split.data.frame(t(counts),
                          samples$ID),
         colSums))

csamples <- samples[,-1]
csamples <- csamples[match(colnames(counts),csamples$ID),]

2.1 Quality Control

  • Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "19.8% percent (13135) of 66360 genes are not expressed"
  • Let us take a look at the sequencing depth, colouring by Stages.
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>% 
  bind_cols(csamples)

ggplot(dat,aes(x,y,fill=csamples$Stages)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())

  • Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 13135 rows containing non-finite values (`stat_density()`).

The same is done for the individual samples colored by Stages.

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Stages=csamples$Stages[match(ind,csamples$ID)])

ggplot(dat,aes(x=values,group=ind,col=Stages)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 622137 rows containing non-finite values (`stat_density()`).

2.2 Export

dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data_genes_TEs.csv"))

3 Data normalisation

3.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.

csamples$Stages <- factor(csamples$Stages)

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = csamples,
  design = ~Stages)
## converting counts to integer mode
save(dds,file=here("data/analysis/salmon/dds_genes_TEs.rda"))

Check the size factors (i.e. the sequencing library size effect)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P7614_301_S1 P7614_302_S2 P7614_303_S3 P7614_304_S4 P7614_305_S5
0.997 0.9789 1.096 1.053 1.102
Table continues below
P7614_306_S6 P7614_307_S7 P7614_308_S8 P7614_309_S9 P7614_310_S10
1.311 1.091 1.167 0.9277 0.8292
Table continues below
P7614_311_S11 P7614_312_S12 P7614_313_S13 P7614_314_S14 P7614_315_S15
0.918 0.8462 1.007 0.9453 1.102
Table continues below
P7614_316_S16 P7614_317_S17 P7614_318_S18 P7614_319_S19 P7614_320_S20
1.01 0.9052 0.9999 0.9703 0.9241
P7614_321_S21 P7614_322_S22 P7614_323_S23 P7614_324_S24
1.119 1.072 0.9857 0.9988
boxplot(sizes, main="Sequencing libraries size factor")

3.2 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
save(vst,file=here("data/analysis/DE/vst-blind_genes_TEs.rda"))

3.3 Variance Stabilising Transformation

vsda <- varianceStabilizingTransformation(dds, blind=FALSE)
vsta <- assay(vsda)
vsta <- vsta - min(vsta)
save(vsta,file=here("data/analysis/DE/vst-aware_genes_TEs.rda"))

# prepare the data to build the network
#ID <- rownames(vsta)
#vsta <- cbind(ID,vsta)
#vsta_tibble <- as_tibble(vsta)
#write_tsv(vsta_tibble,path=here("data/analysis/DE/vst-aware_genes_TEs.tsv"))
  • Validation

The variance stabilisation worked adequately

meanSdPlot(vsta[rowSums(vsta)>0,])

3.4 QC on the normalised data

3.4.1 PCA

#load(here("data/analysis/salmon/dds_genes_TEs.rda"))
#load(here("data/analysis/DE/vst-aware_genes_TEs.rda"))
pc <- prcomp(t(vsta))
percent <- round(summary(pc)$importance[2,]*100)
  • Cumulative components effect

We define the number of variable of the model

nvar=1

An the number of possible combinations

nlevel=nlevels(dds$Stages) 

We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.

ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

3.4.2 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    csamples)

m <- ggplot(pc.dat,aes(x=PC2,y=PC1,col=Stages,text=dds$ID)) + 
  geom_point(size=2) +
  theme_classic() +
  ggtitle("Principal Component Analysis coding") +
          #,subtitle="variance stabilized counts") 
  labs(x=paste("PC1 (",percent[1],"%)",sep=""),
       y=paste("PC2 (",percent[2],"%)",sep="")) +
  theme(text=element_text(size=12)) +
  #theme(plot.title=element_text(size=20)) +
  theme(plot.title = element_text(face = "bold"))
m

plot(m + labs(x=paste("PC1 (",percent[1],"%)",sep=""),
              y=paste("PC2 (",percent[2],"%)",sep="")))

ggplotly(m) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

3.4.3 Heatmap

Filter for noise

conds <- factor(csamples$Stages)
sels <- rangeFeatureSelect(counts=vsta,
                           conditions=conds,
                           nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot

vst.cutoff <- 1
  • Heatmap of “all” genes
hm <- heatmap.2(t(scale(t(vsta[sels[[vst.cutoff+1]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=conds)

  • Biological QA only on TEs
#load(here("data/analysis/DE/vst-aware_genes_TEs.rda"))
TEs <- vsta[grepl("^MA_", rownames(vsta)) == FALSE, ]

4 PCA of only TEs (subsetted data)

pc_TEs <- prcomp(t(vsta[grepl("^MA_", rownames(vsta)) == FALSE, ]))
percent_TEs <- round(summary(pc_TEs)$importance[2,]*100)

4.1 2D

pc.dat_TEs <- bind_cols(PC1=pc_TEs$x[,1],
                    PC2=pc_TEs$x[,2],
                    csamples)

p <- ggplot(pc.dat_TEs,aes(x=PC1,y=PC2,col=Stages,text=dds$ID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent_TEs[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent_TEs[2],"%)",sep="")))

4.1.1 Heatmap

Filter for noise

conds_TEs <- factor(csamples$Stages)
sels_TEs <- rangeFeatureSelect(counts=TEs,
                           conditions=conds_TEs,
                           nrep=3)

vst.cutoff <- 1
  • Heatmap of “all” genes
hm <- heatmap.2(t(scale(t(TEs[sels_TEs[[vst.cutoff+1]],]))),
                distfun=pearson.dist,
                hclustfun=function(X){hclust(X,method="ward.D2")},
                labRow = NA,trace = "none",
                labCol = conds_TEs,
                col=hpal)

plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=conds_TEs)

hm2 <- heatmap.2(TEs, 
          scale = "row", 
          labRow = NULL, 
          labCol = conds_TEs,
          trace = "none",
          col=hpal)

plot(as.hclust(hm2$colDendrogram),xlab="",sub="",labels=conds_TEs)

4.2 Conclusion

# The Biological QA is good.
# The sequencing depth is good. Also looking at the PCAs we don't have any outliers.

5 Session Info

## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] parallel  grid      stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vsn_3.66.0                  tximport_1.26.1            
##  [3] lubridate_1.9.2             forcats_1.0.0              
##  [5] stringr_1.5.0               dplyr_1.1.1                
##  [7] purrr_1.0.1                 readr_2.1.4                
##  [9] tidyr_1.3.0                 tibble_3.2.1               
## [11] tidyverse_2.0.0             plotly_4.10.1              
## [13] pander_0.6.5                hyperSpec_0.100.0          
## [15] xml2_1.3.4                  ggplot2_3.4.2              
## [17] lattice_0.20-45             here_1.0.1                 
## [19] gplots_3.1.3                DESeq2_1.38.3              
## [21] SummarizedExperiment_1.28.0 Biobase_2.58.0             
## [23] MatrixGenerics_1.10.0       matrixStats_0.63.0         
## [25] GenomicRanges_1.50.2        GenomeInfoDb_1.34.9        
## [27] IRanges_2.32.0              S4Vectors_0.36.2           
## [29] BiocGenerics_0.44.0         data.table_1.14.8          
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.1-0       deldir_1.0-6           ellipsis_0.3.2        
##  [4] rprojroot_2.0.3        XVector_0.38.0         rstudioapi_0.14       
##  [7] hexbin_1.28.3          farver_2.1.1           affyio_1.68.0         
## [10] bit64_4.0.5            AnnotationDbi_1.60.2   fansi_1.0.4           
## [13] codetools_0.2-19       cachem_1.0.8           geneplotter_1.76.0    
## [16] knitr_1.42             jsonlite_1.8.4         annotate_1.76.0       
## [19] png_0.1-8              BiocManager_1.30.20    compiler_4.2.0        
## [22] httr_1.4.6             Matrix_1.5-3           fastmap_1.1.1         
## [25] lazyeval_0.2.2         limma_3.54.2           cli_3.6.1             
## [28] htmltools_0.5.5        tools_4.2.0            gtable_0.3.3          
## [31] glue_1.6.2             GenomeInfoDbData_1.2.9 affy_1.76.0           
## [34] Rcpp_1.0.10            jquerylib_0.1.4        vctrs_0.6.1           
## [37] Biostrings_2.66.0      preprocessCore_1.60.2  crosstalk_1.2.0       
## [40] xfun_0.38              brio_1.1.3             testthat_3.1.7        
## [43] timechange_0.2.0       lifecycle_1.0.3        gtools_3.9.4          
## [46] XML_3.99-0.14          zlibbioc_1.44.0        scales_1.2.1          
## [49] vroom_1.6.1            hms_1.1.3              RColorBrewer_1.1-3    
## [52] yaml_2.3.7             memoise_2.0.1          sass_0.4.6            
## [55] latticeExtra_0.6-30    stringi_1.7.12         RSQLite_2.3.1         
## [58] highr_0.10             caTools_1.18.2         BiocParallel_1.32.6   
## [61] rlang_1.1.0            pkgconfig_2.0.3        bitops_1.0-7          
## [64] evaluate_0.21          htmlwidgets_1.6.2      labeling_0.4.2        
## [67] bit_4.0.5              tidyselect_1.2.0       magrittr_2.0.3        
## [70] R6_2.5.1               generics_0.1.3         DelayedArray_0.24.0   
## [73] DBI_1.1.3              pillar_1.9.0           withr_2.5.0           
## [76] KEGGREST_1.38.0        RCurl_1.98-1.12        crayon_1.5.2          
## [79] interp_1.1-4           KernSmooth_2.23-20     utf8_1.2.3            
## [82] tzdb_0.3.0             rmarkdown_2.21         jpeg_0.1-10           
## [85] locfit_1.5-9.7         blob_1.2.4             digest_0.6.31         
## [88] xtable_1.8-4           munsell_0.5.0          viridisLite_0.4.2     
## [91] bslib_0.4.2